An example of processing and plotting data from Indeed after scraping (code at https://github.com/henrique1837/indeed_scraper)
library(plyr)
library(stringr)
library(ggplot2)
library(plotly)
library(leaflet)
#### Read data ####
# Stack every scraped Indeed CSV found under ./results into `df_t`.
# For each file:
#   * rows whose posting age is the literal "30+" marker are discarded
#     (their real age is unknown);
#   * the relative age in `date` (first run of digits) is subtracted from the
#     date embedded in the file name (YYYY-MM-DD) to get an absolute Date;
#     rows without any digits in `date` get the file date itself;
#   * a `country` column is added with "USA" when the file has none;
#   * rows whose `link` was already taken from an earlier file are skipped.
files <- Sys.glob("./results/*")
files <- files[grepl("indeed", files, fixed = TRUE)]

# One slot per file; pieces are bound once at the end instead of growing a
# data frame with rbind() on every iteration.
parts <- vector("list", length(files))
seen_links <- character()

# seq_along() yields an empty sequence when no file matched, whereas
# 1:length(files) would iterate over c(1, 0) and try to read a missing file.
for (i in seq_along(files)) {
  df <- read.csv(file = files[i], stringsAsFactors = FALSE)

  # fixed() makes "30+" a literal string. As a regular expression it means
  # "3 followed by one or more 0" and would also discard e.g. "30 days ago".
  # which() keeps the original behaviour of dropping rows with a missing date.
  df <- df[which(!str_detect(string = df$date, pattern = fixed("30+"))), ]

  file_date <- as.Date(str_extract(
    string = files[i],
    pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}"
  ))
  df$date <- file_date - as.numeric(str_extract(
    string = df$date,
    pattern = "[[:digit:]]+"
  ))
  df$date[which(is.na(df$date))] <- file_date

  # Test for the column itself (not for length 0) and recycle explicitly, so a
  # file whose rows were all filtered out does not abort the whole import.
  if (!("country" %in% names(df))) {
    df$country <- rep("USA", nrow(df))
  }

  # Keep only postings not already collected from a previous file.
  df <- df[which(!(df$link %in% seen_links)), ]
  seen_links <- c(seen_links, df$link)
  parts[[i]] <- df
  #message("Files ",i," - ",length(files))
}

df_t <- if (length(parts) > 0) do.call(rbind, parts) else data.frame()
# Derive `city`, `state` and a per-row counter from the free-text `location`.
# City: everything before the first comma. Cutting at the first punctuation
# character of any kind would truncate names such as "St. Louis" or
# "Winston-Salem", which then fail to match the coordinates table.
df_t$city <- trimws(sub(pattern = ",.*$",
                        replacement = "",
                        x = df_t$location))
# State: the first stand-alone two-letter upper-case token. The word
# boundaries stop the pattern from matching inside longer upper-case words.
df_t$state <- str_extract(string = df_t$location,
                          pattern = "\\b[A-Z]{2}\\b")
# One unit per posting, summed later by the aggregations. rep() keeps the
# assignment valid when `df_t` has no rows.
df_t$count <- rep(1, nrow(df_t))
# Posting totals at four granularities. Each call groups `df_t` by the listed
# columns and sums the per-row `count` into `totalJobs`.

# Per day.
df_agreggated <- ddply(df_t, "date", summarize,
                       totalJobs = sum(count))

# Per day and company.
df_companies_date <- ddply(df_t, c("date", "company"), summarize,
                           totalJobs = sum(count))

# Per company, over the whole period.
df_companies <- ddply(df_t, "company", summarize,
                      totalJobs = sum(count))

# Per place (city, state, country).
df_places <- ddply(df_t, c("city", "state", "country"), summarize,
                   totalJobs = sum(count))
# Latitude / longitude of USA cities (source https://simplemaps.com/data/us-cities).
# The CSV is fetched only when it is not already present under ./results,
# then loaded with strings kept as character.
lat_long_csv <- "./results/USA_lat_long.csv"
if (!file.exists(lat_long_csv)) {
  download.file(
    "https://simplemaps.com/static/data/us-cities/uscitiesv1.4.csv",
    lat_long_csv
  )
}
df_lat_long_cities <- read.csv(lat_long_csv, stringsAsFactors = FALSE)
## Preparing dataframe for leaflet map ##
# For every USA place in `df_places`, look up coordinates in
# `df_lat_long_cities` and collect the matches into `df_map`
# (columns: city, lat, lng, totalJobs).
# Matching is case-insensitive on the city name. When several reference rows
# share that name, the one whose `state_id` equals the place's state is used
# (the last such row if there are several); if none agrees on the state, the
# first same-name row is used. Exactly one row is always selected, which
# removes the "number of items to replace is not a multiple of replacement
# length" warnings the element-wise assignments used to raise.
# Places with no city match are left out of `df_map` instead of appearing as
# all-NA rows.
df_placesUSA <- df_places[which(df_places$country == "USA"), ]

# Upper-case the reference columns once rather than on every iteration.
ref_city <- toupper(df_lat_long_cities$city)
ref_state <- toupper(df_lat_long_cities$state_id)

# Row of `df_lat_long_cities` chosen for each place; NA when nothing matched.
match_idx <- rep(NA_integer_, nrow(df_placesUSA))

# seq_len() is empty for a zero-row frame; 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(df_placesUSA))) {
  candidates <- which(ref_city == toupper(df_placesUSA$city[i]))
  if (length(candidates) == 0) {
    next
  }
  # which() discards NA comparisons, so a place without a state simply has no
  # same-state candidate instead of producing an NA condition.
  same_state <- candidates[
    which(ref_state[candidates] == toupper(df_placesUSA$state[i]))
  ]
  match_idx[i] <- if (length(same_state) > 0) {
    same_state[length(same_state)]
  } else {
    candidates[1]
  }
}

found <- !is.na(match_idx)
totalJobs <- df_placesUSA$totalJobs[found]
cities <- df_placesUSA$city[found]
lats <- df_lat_long_cities$lat[match_idx[found]]
longs <- df_lat_long_cities$lng[match_idx[found]]

df_map <- data.frame(city = cities,
                     lat = lats,
                     lng = longs,
                     totalJobs = totalJobs)
## [1] "2018-12-29" "2019-02-15"
## [1] 247
## [1] 85
## [1] 50
## [1] 43
## Warning in validateCoords(lng, lat, funcName): Data contains 4 rows with
## either missing or invalid lat/lon values and will be ignored
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=pt_PT.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=pt_BR.UTF-8 LC_COLLATE=pt_PT.UTF-8
## [5] LC_MONETARY=pt_BR.UTF-8 LC_MESSAGES=pt_PT.UTF-8
## [7] LC_PAPER=pt_BR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 leaflet_2.0.2 plotly_4.8.0 ggplot2_3.1.0
## [5] stringr_1.3.1 plyr_1.8.4
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 later_0.7.5 pillar_1.3.1
## [4] compiler_3.4.4 bindr_0.1.1 tools_3.4.4
## [7] digest_0.6.18 viridisLite_0.3.0 jsonlite_1.6
## [10] evaluate_0.12 tibble_2.0.1 gtable_0.2.0
## [13] pkgconfig_2.0.2 rlang_0.3.1 shiny_1.2.0
## [16] crosstalk_1.0.0 yaml_2.2.0 xfun_0.4
## [19] withr_2.1.2 dplyr_0.7.8 httr_1.4.0
## [22] knitr_1.21 htmlwidgets_1.3 grid_3.4.4
## [25] tidyselect_0.2.5 glue_1.3.0 data.table_1.12.0
## [28] R6_2.3.0 rmarkdown_1.11 tidyr_0.8.2
## [31] purrr_0.3.0 magrittr_1.5 promises_1.0.1
## [34] scales_1.0.0 htmltools_0.3.6 assertthat_0.2.0
## [37] xtable_1.8-3 mime_0.6 colorspace_1.4-0
## [40] httpuv_1.4.5.1 labeling_0.3 stringi_1.2.4
## [43] lazyeval_0.2.1 munsell_0.5.0 crayon_1.3.4